1 Métodos

1.1 Qué hace clusterProfiler

Link vignette: https://yulab-smu.github.io/clusterProfiler-book/index.html

  • Cell marker: tendría que buscar genes representativos de monocitos.
  • Análisis de enfermedad: podría ser interesante para demostrar que son células inmunes, ya que deberían salir enfermedades del tipo leucemias.
  • Gene Ontology Analysis
  • KEGG pathways Analysis.
  • Reactome pathways Analysis.
  • Functional enrichment analysis of genomic coordinations. Funciona junto con ChIPseeker, puede ser interesante.
dataAnnotatr <- read.csv(file.path(dataPath, dirO.7, "annotatr_results_E1_0.7.csv"))

2 Análisis

2.1 Aproximación 1: cogemos todos los genes

allGenesDF <- data.frame(ENTREZID = dataAnnotatr$annot.gene_id,
                       SYMBOL = dataAnnotatr$annot.symbol)

# allGenesDF <- bitr(allGenes, fromType = "SYMBOL",
#                       toType = c("ENSEMBL", "ENTREZID"),
#                       OrgDb = org.Hs.eg.db)

2.1.1 clusterProfiler

2.1.1.1 GO Analysis

2.1.1.1.1 Over-representation analysis
ggo1 <- sapply(c("MF", "CC", "BP"), function(term) {
              enrichGO(unique(allGenesDF$ENTREZID), org.Hs.eg.db,
                             ont = term, qvalueCutoff = 0.05,
                             readable = TRUE)})
dotplot(ggo1$MF, showCategory = 20, font.size = 8)

barplot(ggo1$MF, showCategory = 20, font.size = 8)

upsetplot(ggo1$MF)

# goplot(ggo1)

Tabla resultados

DT::datatable(ggo1$MF@result)
ggo1MFsimplified <- simplify(ggo1$MF)

dotplot(ggo1MFsimplified, showCategory = 20, font.size = 8)

barplot(ggo1MFsimplified, showCategory = 20, font.size = 8)

2.1.1.2 Cell Markers analysis

Link datos: http://bio-bigdata.hrbmu.edu.cn/CellMarker

dataHumanCells <- read.delim(file.path(analysisPath, "/data/Human_cell_markers.txt"))

dataHumanCells <- dataHumanCells %>% tidyr::unite("cellMarker", tissueType, 
                                                  cancerType, cellName, sep=", ") %>% 
   dplyr::select(cellMarker, geneID) %>%
   dplyr::mutate(geneID = strsplit(geneID, ', '))

Mediante la función enricher podemos ver el enriquecimiento de genesets propios en nuestro conjunto de genes. Es la msima base que con las anotaciones GO, pero aquí estamos construyendo nosotros la base de datos.

typeCells <- enricher(unique(allGenesDF$ENTREZID), TERM2GENE = dataHumanCells,
                      pAdjustMethod = "fdr", minGSSize = 5, qvalueCutoff = 0.2)

dotplot(typeCells, showCategory = 10, font.size = 8)

barplot(typeCells, "GeneRatio", showCategory = 10, font.size = 8)

upsetplot(typeCells)

DT::datatable(typeCells@result)

Subset con células sangre

dataBloodCells <- read.delim(file.path(analysisPath, "/data/Human_cell_markers.txt"))

monocytesInfo <- unique(dataBloodCells[grepl(".*([mM]onocyte)", dataBloodCells$cellName),]$tissueType)

dataBloodCells <- dataBloodCells %>% filter(tissueType %in% monocytesInfo) %>% 
                    tidyr::unite("cellMarker", tissueType, cancerType, cellName, sep=", ") %>% 
                    dplyr::select(cellMarker, geneID) %>%
                    dplyr::mutate(geneID = strsplit(geneID, ', '))

Con el subset no es capaz de sacar genesets significativos, ya que probablemente los genes que comparten sean muy parecidos entre todos los grupos.

typeCellsBlood <- enricher(unique(allGenesDF$ENTREZID), TERM2GENE = dataBloodCells,
                      pAdjustMethod = "fdr", minGSSize = 5, qvalueCutoff = 0.2)

dotplot(typeCellsBlood, showCategory = 10, font.size = 8)

barplot(typeCellsBlood, "GeneRatio", showCategory = 10, font.size = 8)

DT::datatable(typeCellsBlood@result)

2.1.1.3 MSigDB analysis (C8)

Es la base de datos de GSEA, tienen un apartado específico para células inmunes. Está bien que las entradas salgan relacionadas con monocitos, ya que estamos sesgando mucho el análisis por hacer enriquecimiento en una base de datos donde solo hay células inmunes.

Link paper: https://www.cell.com/immunity/fulltext/S1074-7613(15)00532-4

geneImmSginature <- msigdbr(species = "Homo sapiens", category = "C7") %>% 
  dplyr::select(gs_name, entrez_gene)

immCells <- enricher(unique(allGenesDF$ENTREZID), TERM2GENE = geneImmSginature)
dotplot(immCells, showCategory = 10, font.size = 6)

barplot(immCells, "GeneRatio", showCategory = 10, font.size = 6)

upsetplot(immCells)

2.1.1.4 Enrichment analysis based on the DisGeNET

dso1 <- enrichDGN(allGenesDF$ENTREZID, readable = TRUE)
dotplot(dso1, showCategory = 20, font.size = 8)

barplot(dso1, showCategory = 20, font.size = 8)

upsetplot(dso1)

do1 <- enrichDO(allGenesDF$ENTREZID,
                 readable = TRUE)
dotplot(do1, showCategory = 20, font.size = 8)

barplot(do1, showCategory = 20, font.size = 8)

upsetplot(do1)

2.1.1.5 KEGG Analysis

2.1.1.5.1 Con clusterProfiler
kegg1 <- enrichKEGG(gene = allGenesDF$ENTREZID,
                    organism = "hsa")
dotplot(kegg1, showCategory = 20, font.size = 8)

barplot(kegg1, showCategory = 20, font.size = 8)

upsetplot(kegg1)

2.1.1.5.2 Con KEGGGprofile
pho_KEGGresult <- find_enriched_pathway(unique(allGenesDF$ENTREZID), 
                                        species = 'hsa')

DT::datatable(pho_KEGGresult[[1]])

2.1.1.6 Functional enrichment analysis of genomic coordinations

2.2 Aproximación 2: coger genes con anotaciones en promotores

genesPromoter <- dataAnnotatr %>% filter(grepl("^(promoter).*", annot.id),
                                         !is.na(annot.gene_id))
genesPromoterDF <- data.frame(ENTREZID = genesPromoter$annot.gene_id,
                              SYMBOL = genesPromoter$annot.symbol)
 
# genesPromoterDF <- bitr(genesPromoter, fromType = "SYMBOL",
#                       toType = c("ENSEMBL", "ENTREZID"),
#                       OrgDb = org.Hs.eg.db)

2.2.1 clusterProfiler

2.2.1.1 GO Analysis

2.2.1.1.1 Over-representation analysis
ggo2 <- sapply(c(c("MF", "CC", "BP")), function(x) {
  enrichGO(unique(genesPromoterDF$ENTREZID), org.Hs.eg.db,
                 ont = "MF", qvalueCutoff = 0.05,
                 readable = TRUE)}
  )
dotplot(ggo2$MF, showCategory = 20, font.size = 8)

barplot(ggo2$MF, showCategory = 20, font.size = 8)

upsetplot(ggo2$MF)

goplot(ggo2$MF)

DT::datatable(ggo2$MF@result)

Para evitar la redundancia de los términos, el paquete te permite limpiar entradas redundantes. Pasa de 1014 a 91 términos enriquecidos.

ggo2MFsimplified <- simplify(ggo2$MF)

dotplot(ggo2MFsimplified, showCategory = 20, font.size = 8)

barplot(ggo2MFsimplified, showCategory = 20, font.size = 8)

2.2.1.1.2 GO Semantic Similarity Analysis

Ejemplo

hsGO <- godata('org.Hs.eg.db', ont="MF")

distan <- mgeneSim(head(genesPromoterDF$ENTREZID, n = 20), semData = hsGO, 
                   measure = "Wang", combine = "BMA")

  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |======================================================================| 100%
pheatmap::pheatmap(distan)

2.2.1.2 Cell Markers analysis

https://rdrr.io/bioc/enrichplot/src/R/dotplot.R

typeCellsPromoters <- enricher(unique(genesPromoterDF$ENTREZID), TERM2GENE = dataHumanCells,
                      pAdjustMethod = "fdr", minGSSize = 5, qvalueCutoff = 0.2)

dotplot(typeCellsPromoters, showCategory = 50, font.size = 8)

barplot(typeCellsPromoters, showCategory = 50, font.size = 8)

DT::datatable(typeCellsPromoters@result)

2.2.1.3 Enrichment analysis based on the DisGeNET

dso2 <- enrichDGN(genesPromoterDF$ENTREZID,
                 readable = TRUE)
dotplot(dso2, showCategory = 20, font.size = 8)

barplot(dso2, showCategory = 20, font.size = 8)

upsetplot(dso2)

enrichplot::pmcplot(head(dso2$Description), 2012:2019)

2.3 Aproximación 3: genes left join con monocitos-iPS

2.3.1 Anotación del bed

if (file.exists(file.path(outputPath, "dm_annotated.rds")) &
    file.exists(file.path(outputPath, "dm_random_annotated.rds"))){
  
  dm_annotated <- readRDS(file.path(outputPath, "dm_annotated.rds"))
  dm_random_annotated <- readRDS(file.path(outputPath, "dm_random_annotated.rds"))
  df_dm_annotated <- data.frame(dm_annotated)
  
} else {
  
  regionsInter <- read_regions(con = file.path(analysisPath, "data/intersect_iPScells/solo_monocitos.bed"), 
                             genome = 'hg19',
                             format = 'bed')

  annots <- c('hg19_genes_cds', 'hg19_basicgenes', 'hg19_cpgs', 'hg19_genes_intergenic',
             'hg19_enhancers_fantom')
  
  annotations <- build_annotations(genome = 'hg19', annotations = annots)
  # Intersect the regions we read in with the annotations
  dm_annotated <- annotate_regions(regions = regionsInter, 
                                   annotations = annotations,
                                   minoverlap = 100L, #Overlap of annotations, f=-0.5
                                   ignore.strand = TRUE,
                                   quiet = FALSE)
  
  df_dm_annotated <- data.frame(dm_annotated)
  
  # Randomize the input regions
  dm_random_regions <- randomize_regions(regions = regionsInter,
                                        allow.overlaps = TRUE,
                                        per.chromosome = TRUE)
  
  # Annotate the random regions using the same annotations as above
  # These will be used in later functions
  dm_random_annotated <- annotate_regions(regions = dm_random_regions,
                                          annotations = annotations,
                                          minoverlap = 100L, #Overlap of annotations, f=-0.5
                                          ignore.strand = TRUE, quiet = TRUE)
  
  saveRDS(dm_annotated, file.path(outputPath, "dm_annotated.rds"))
  saveRDS(dm_random_annotated, file.path(outputPath, "dm_random_annotated.rds"))
}
# Find the number of regions per annotation type
dm_annsum <- summarize_annotations(annotated_regions = dm_annotated, quiet = TRUE)
print(dm_annsum)
# A tibble: 13 x 2
   annot.type                n
   <chr>                 <int>
 1 hg19_cpg_inter         3408
 2 hg19_cpg_islands       1722
 3 hg19_cpg_shelves        277
 4 hg19_cpg_shores        6401
 5 hg19_enhancers_fantom   748
 6 hg19_genes_1to5kb      2528
 7 hg19_genes_3UTRs        238
 8 hg19_genes_5UTRs        774
 9 hg19_genes_cds          490
10 hg19_genes_exons       1813
11 hg19_genes_intergenic   840
12 hg19_genes_introns     8316
13 hg19_genes_promoters   3163
# Count the occurrences of classifications in the Status
# column across the annotation types.
dm_catsum <- summarize_categorical(annotated_regions = dm_annotated,
                                  quiet = TRUE)
print(dm_catsum)
# A tibble: 30,230 x 3
# Groups:   annot.type [13]
   annot.type     annot.id        n
   <chr>          <chr>       <int>
 1 hg19_cpg_inter inter:10008     3
 2 hg19_cpg_inter inter:10011     3
 3 hg19_cpg_inter inter:10016     2
 4 hg19_cpg_inter inter:10017     1
 5 hg19_cpg_inter inter:10026     1
 6 hg19_cpg_inter inter:10032     1
 7 hg19_cpg_inter inter:10034     5
 8 hg19_cpg_inter inter:10048     4
 9 hg19_cpg_inter inter:10087     3
10 hg19_cpg_inter inter:10140     1
# … with 30,220 more rows

2.3.1.1 BED

#View a heatmap of regions occurring in pairs of annotations
annots_order <- c('hg19_genes_promoters', 'hg19_genes_5UTRs', 
                  'hg19_genes_exons', 'hg19_genes_introns',
                  'hg19_genes_3UTRs', 'hg19_genes_cds',
                  'hg19_enhancers_fantom')

dm_vs_coannotations <- plot_coannotations(annotated_regions = dm_annotated,
                                          annotation_order = annots_order, 
                                          axes_label = 'Annotations',
                                          plot_title = 'Regions in Pairs of Annotations')
print(dm_vs_coannotations)

dm_annotations_plot <- plot_annotation(annotated_regions = dm_annotated,
                annotation_order = annots_order,
                plot_title = 'Annotations for monocytes-iPCs',
                x_label = 'Annotation type',
                y_label = 'Count')

print(dm_annotations_plot)

2.3.1.2 With random

# View the number of regions per annotation and include the annotation
# of randomized regions
dm_annotations_plot_wrandom = plot_annotation(annotated_regions = dm_annotated,
                                              annotated_random = dm_random_annotated,
                                              annotation_order = annots_order,
                                              plot_title = 'Annotations for monocytes-iPC (with rndm.)',
                                              x_label = 'Annotation type',
                                              y_label = 'Count')
print(dm_annotations_plot_wrandom)

#CpGIslands
annots_order = c('hg19_cpg_islands', 'hg19_cpg_shores',
                 'hg19_cpg_shelves', 'hg19_cpg_inter')

dm_annotations_plot_wrandom = plot_annotation(annotated_regions = dm_annotated,
                                              annotated_random = dm_random_annotated,
                                              annotation_order = annots_order,
                                              plot_title = 'Annotations for monocytes-iPC (with rndm.)', 
                                              x_label = 'Annotation type',
                                              y_label = 'Count')
print(dm_annotations_plot_wrandom)

2.4 Enrichment genes BED

genesMonoIPC <- data.frame(ENTREZID = df_dm_annotated$annot.gene_id,
                           SYMBOL = df_dm_annotated$annot.symbol)

intersectGenes <- intersect(genesMonoIPC$ENTREZID, allGenesDF$ENTREZID)

cat(">> Número de genes en E1:", length(unique(allGenesDF$ENTREZID)))
>> Número de genes en E1: 10736
cat("\n>> Número de genes en Mono-iPCs:", length(unique(genesMonoIPC$ENTREZID)))

>> Número de genes en Mono-iPCs: 4932
cat("\n>> Número de genes coincidentes:", length(intersectGenes))

>> Número de genes coincidentes: 4932

2.4.1 GO Analysis

ggo3 <- sapply(c("MF", "CC", "BP"), function(term) {
              enrichGO(unique(genesMonoIPC$ENTREZID), org.Hs.eg.db,
                             ont = term, qvalueCutoff = 0.05,
                             readable = TRUE)})
dotplot(ggo3$MF, showCategory = 20, font.size = 8)

barplot(ggo3$MF, showCategory = 20, font.size = 8)

upsetplot(ggo3$MF)

# goplot(ggo1)

2.4.1.1 KEGG Analysis

kegg3 <- enrichKEGG(unique(genesMonoIPC$ENTREZID),
                    organism = "hsa")
dotplot(kegg3, showCategory = 20, font.size = 8)

barplot(kegg3, showCategory = 20, font.size = 8)

upsetplot(kegg3)